Libraries

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(sf)
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
library(raster)
## Loading required package: sp
## 
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:tidyr':
## 
##     extract

Load Data

Background image

# Load the merged Satellite image for Kazanlak Valley, Bulgaria
kaz <-brick("../1_Teaching/cds-spatial/data/Kaz.tif")
plotRGB(kaz, stretch= "lin")

crs(kaz)
## CRS arguments:
##  +proj=utm +zone=35 +datum=WGS84 +units=m +no_defs

Prediction data

# Load prediction data as points (left bottom corner of the evaluated cell)
cnne_df <- read_csv("2021-10-25.predictions/results/east/east.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   x = col_double(),
##   y = col_double(),
##   mound_probability = col_double()
## )
# 15334 points (origins in the raster cells)
cnnw_df <- read_csv("2021-10-25.predictions/results/west/west.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   x = col_double(),
##   y = col_double(),
##   mound_probability = col_double()
## )
# 15334 points (origins in the raster cells)
cnn_df <- rbind(cnne_df, cnnw_df)
# 30504 rows

# Check the probabilities of there being a mound in a given cell
hist(cnn_df$mound_probability,
        main = "Probability of mound present in a satellite image gridcell");abline(v=0.6, col="red", lty = 2, lwd = 3)

table(cut(cnn_df$mound_probability, c(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1)))
## 
##   (0,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6] (0.6,0.7] (0.7,0.8] 
##      7865      7710      6983      4757      2243       653       195        58 
## (0.8,0.9]   (0.9,1] 
##        30        10
# ok, so there is an exponential drop-off. 

Make the predictions spatial and convert to grid

### Build a grid of those with 60%+ probability of containing a mound

# Build a grid from all points
#cnnall_sp <- st_as_sf(cnn_df, coords = c("x","y"), crs = 32635)
#cnnall_grid <- st_make_grid(cnnall_sp, cellsize = 250, what = "polygons")

# Filter predictions to those that have 60+% likelihood of containing a mound
cnn60_sp <- cnn_df %>% 
  filter(mound_probability > 0.59) %>%  #333 observations
  st_as_sf(coords = c("x","y"), crs = 32635)

# Make a grid of 60%+ cells
cnn_grid <- st_make_grid(cnn60_sp, cellsize = 250, what = "polygons")
plot(cnn_grid)

# Add probability data to the 60%+ grid 
#cnnall_datagrid <- st_join(st_sf(cnnall_grid), cnnall_sp)
cnn_grid <- st_join(st_sf(cnn_grid), cnn60_sp)

# Visualize the grid cells with higher probability 
ggplot(cnn_grid) +
  geom_sf(aes(color = mound_probability))

# 333 raster cells are predicted to contain mounds with greater 
# than 60% likelihood. Archaeologists found 773 mounds in fraction of the area

# View the grid 
plotRGB(kaz, stretch= "lin");
plot(cnn_grid$geometry, add = TRUE, border = "white")

Field data

that we will need later for validation

# Bring in survey area to see overall coverage
survey <- st_read("../1_Teaching/cds-spatial/data/KAZ_surveyarea.shp")
## Reading layer `KAZ_surveyarea' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_surveyarea.shp' using driver `ESRI Shapefile'
## Simple feature collection with 98 features and 5 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 352181.7 ymin: 4712923 xmax: 368890.2 ymax: 4733359
## projected CRS:  WGS 84 / UTM zone 35N
plot(survey$geometry)

# Bring in all the mounds observed in the field
mounds <- st_read("../1_Teaching/cds-spatial/data/KAZ_mounds.shp")
## Reading layer `KAZ_mounds' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_mounds.shp' using driver `ESRI Shapefile'
## Simple feature collection with 773 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS:  WGS 84 / UTM zone 35N
plot(mounds)

# Extrapolate rough study area after subtracting outliers
far <- mounds %>% 
  st_is_within_distance(survey, 1500) %>% 
  lengths>0 
farmounds <- which(far==0)  # these are the mounds that are mostly far from survey area

# convex hull of points without outliers
survey_sm <- st_convex_hull(st_union(mounds$geometry[-farmounds]))

# convex hull of survey polygons 
survey_ch <- st_convex_hull(st_union(survey$geometry))

# quickly see the difference in bounding boxes
plot(survey_ch, col = "green");plot(survey_sm, col = "red", alpha=0.5, add = TRUE); plot(survey$geometry, add =TRUE)

Plot the basic data

# View all mounds
plotRGB(kaz, stretch= "lin");
plot(survey_ch, col = "green", add = TRUE);
plot(survey_sm, col = "red", add = TRUE);
plot(survey$geometry, col = "lightyellow", add = TRUE );
plot(mounds$geometry[-farmounds,], add = TRUE, col = "pink");
plot(cnn_grid$geometry, add = TRUE, border = "white")

Validation

Area surveyed in 2009-2011 is ca 85 sq km and contains 773 documented mounds of various shapes and sizes. This area contains 192 grid cells of 250m a side which were allocated 0.6+ probability of containing a mound. Let’s explore how many mounds are outside these grid cells and which mounds were predicted and which not. Bring in mound attributes and check correlation. Does size play a role?

# Bring in mound attributes (n = 773) to aid exploration
mounddata <- read_csv("../1_Teaching/cds-spatial/data/KAZ_mdata.csv")
## Parsed with column specification:
## cols(
##   MoundID = col_double(),
##   Condition = col_double(),
##   Robbed = col_double(),
##   Height = col_double(),
##   LandUse = col_character()
## )
mounds <- mounds %>% 
  left_join(mounddata, by = c("TRAP_Code"="MoundID"))
# Select gridcells that fall within TRAP study area

survey_grids <- st_intersection(survey_ch, cnn_grid)
plot(survey_grids)

# Check spatial overlap between 60%+ grid cells and all 773 mounds 
predicted <- st_intersection(mounds, cnn_grid) 
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
# Which ones areun-predicted? 
`%nin%` = Negate(`%in%`)
unpredicted <- mounds[mounds$TRAP_Code%nin%predicted$TRAP_Code,]

predicted #166 unique features here
## Simple feature collection with 166 features and 11 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 352481.3 ymin: 4713055 xmax: 366872 ymax: 4729119
## projected CRS:  WGS 84 / UTM zone 35N
## First 10 features:
##     Notes TRAP_Code Cluster      Lat     Long Condition Robbed Height LandUse
## 82  Mound      4066       3 42.55737 25.30552         3      1    5.0 Pasture
## 83  Mound      4067       3 42.56229 25.31209         2      1    2.0 Pasture
## 84  Mound      4068       3 42.56229 25.31277         2      1    5.0 Pasture
## 87  Mound      4071       3 42.56499 25.30981         3      1    1.5 Pasture
## 614 Mound      3354       3 42.58316 25.27124         1      1    0.5 Pasture
## 619 Mound      3406       3 42.58347 25.27130         3      1    2.0 Pasture
## 620 Mound      3407       3 42.58296 25.27126         2      1    1.0 Pasture
## 621 Mound      3408       3 42.58282 25.27120         2      1    2.0 Pasture
## 631 Mound      3701       3 42.58326 25.26997         1      0    5.0 Pasture
## 632 Mound      3702       3 42.58336 25.27074         2      1    4.0 Pasture
##        X1 mound_probability                 geometry
## 82   4748         0.8855298 POINT (360896.8 4713055)
## 83   5499         0.7194230 POINT (361446.8 4713591)
## 84   5499         0.7194230   POINT (361503 4713589)
## 87   5315         0.6406531 POINT (361265.5 4713894)
## 614 14149         0.6249947 POINT (358140.6 4715976)
## 619 14149         0.6249947 POINT (358146.9 4716009)
## 620 14149         0.6249947 POINT (358141.8 4715954)
## 621 14149         0.6249947 POINT (358136.6 4715938)
## 631 14149         0.6249947   POINT (358037 4715989)
## 632 14149         0.6249947 POINT (358100.5 4715999)
unique(predicted$TRAP_Code) # 144 unique mounds
##   [1] 4066 4067 4068 4071 3354 3406 3407 3408 3701 3702 3703 4019 4020 4008 4009
##  [16] 4002 4024 4011 4006 4007 3049 3050 3051 4086 4087 3275 3276 5001 3340 3531
##  [31] 3329 3532 3328 3341 3690 3691 3692 3693 3694 3695 3696 3102 3103 3101 3678
##  [46] 3679 3663 2086 3445 3444 3443 3440 3439 3446 2092 3447 3442 2091 2093 3293
##  [61] 3073 3673 3674 3675 3676 3677 3600 3602 3603 3604 3641 3640 3642 3643 3644
##  [76] 3645 3646 3647 3648 3649 3650 3652 3651 3662 3664 3665 3666 3667 3668 3669
##  [91] 2144 3537 3538 3539 2134 2135 2136 2137 2141 2142 2143 3312 3517 3707 3709
## [106] 3711 3712 3713 3714 2139 2148 2150 2151 2152 2153 2154 2175 3681 3113 1067
## [121] 2184 2192 2193 2194 2196 2186 2187 4108 3621 3625 3627 3141 3140 3616 3617
## [136] 3618 3619 3620 3626 3628 3128 3632 3633 3203 1034 1035
unique(cnn_grid$X1) #332 unique cells
##   [1] 14800 13653     4     5   191  1687  2437  6480 13660 14812 13478  1886
##  [13]    14  3159   202 14261 14817 12548 14263 14818 14238 14460  5581  7517
##  [25]  2255  9762  5956   975  2284  8832  6964   968   420  4908  4909  7999
##  [37]   426  9775  8096  2088   807 11093 11280 12028  6420  5980 10535 11657
##  [49]  5056 13381 11658  2125 14680  6052 11475  4184  7175  7362  3998  4372
##  [61]  6055  9982  4374  4561  4748  5122  5870  6057 11106 10432  3628  4002
##  [73]  4188  4562  4749  4750  4936  4937  5123  5497  5871  6058  8302 13538
##  [85]  1368  3816  4004  4565  4751  5313  5499   639  4005  4192  4379  4566
##  [97]  4753  4007  4194  4754  4941  5315  5502  4382  7849  7852 12847  1955
## [109]  3968  7668  8038  8778  3266 10933  2119 10373  3082  7758  1569  2864
## [121]  4715  5639 14149  5087  4583  3608  7123 10638  6200   466   651   652
## [133]  1392  3981  6202  4774  5336   653   838  3654  3655  3841  3842  4215
## [145]  4216  5337  7394   285  1579  1580  2135  4910  5095  5280  5465  3656
## [157]  4591  5339   842  1027  1211  1212  1396  1397  4541  5651  3657  3844
## [169]  3845  4031  4032  4405  4966  4967  5154  1398  1583  4033  4968  5343
## [181]  3474 10767  2696  3849  4037  5345  5346 12873  4225 10210  3853  3256
## [193]  9362 13061  1984  3106  3481  4976  5163  5350  5537  5724  5911  9465
## [205] 10774  8993 12693 14173  8809 13620 14174  2336 12882 13437  4928   309
## [217]  3487  7414  7789  4190  5115   871  5857   873 14181 14551  1994  6295
## [229] 11345   863 14183  2183  6297  6672 13448  5928  2904  4940  5310  6308
## [241]  6609 11604  1246  3098  1990  3099  5874  6060  6614  6246  6247  6431
## [253]  6250  9579 14206  6993  2000  7920  8105  9029  9584 13839 13840  9316
## [265]  5886  7922 13842 10439  7923  8293 10513  4410  8480  8664  8849  9035
## [277] 10884 11439  4411  7741  8112  8482  8666  8667 12315  8113 12316  7930
## [289]  8670 10889 14775  9412 10891 13112 13851 14037  8118 10523 14752 13300
## [301]  7753  8123  4472  4473   724 14784   726 10717  3728  4289  4475  7943
## [313]  3355  3916  4290  4801  4987 14237  3359  9717 15327  2028  3138  4803
## [325] 13272   920  4806  5177  8877 13501  5733  5549
# 146 unique mounds (of 166 intersecting) appear in the 332 unique (of 380) grids




grids_n_mounds <- predicted %>% 
#  st_drop_geometry() %>% 
  group_by(X1) %>% 
  count()

hist(grids_n_mounds$n, breaks = 40)

grids_n_mounds %>% 
  filter(n>2)
## Simple feature collection with 16 features and 2 fields
## geometry type:  MULTIPOINT
## dimension:      XY
## bbox:           xmin: 352481.3 ymin: 4715938 xmax: 358168.9 ymax: 4727092
## projected CRS:  WGS 84 / UTM zone 35N
## First 10 features:
##      X1  n                       geometry
## 1  6993  7 MULTIPOINT ((352481.3 47247...
## 2  7920  6 MULTIPOINT ((353087.2 47250...
## 3  7922 29 MULTIPOINT ((353041.2 47252...
## 4  7923  4 MULTIPOINT ((353034.3 47255...
## 5  8105  6 MULTIPOINT ((353087.2 47250...
## 6  8293 15 MULTIPOINT ((353277 4725598...
## 7  8480  6 MULTIPOINT ((353282.8 47256...
## 8  8482  4 MULTIPOINT ((353488.9 47259...
## 9  8849  3 MULTIPOINT ((353855.3 47256...
## 10 9029 11 MULTIPOINT ((353819.8 47249...
# Gridcells contain between 0 and 29 mounds, which makes sense for the necropolis of little 10m diameter mounds in the NW

Overview of results

# See the predicted mounds over grids with 60% likelihood of mound plus unpredicted mounds

library(mapview)
mapview(unpredicted, color = "orange", alpha= 0.5,
         map.types = c("Esri.WorldImagery", "CartoDB.Positron"))+
  mapview(grids_n_mounds, zcol = "n", at = c(1, 2, 5, 10, 15, 30),
          layer.name = " Predicted mounds per grid")+
  survey_grids

Couple Observations

Look at the NW cluster and see that many mounds sit on the border of polygons, and so get counted twice > ca 20 mounds, so the numbers of predicted are slightly exaggerated In NE we are missing a lot of very large 20m diameter+ mounds that are covered with scrub (look forested) In the south a lot of forest patches and beach boundaries are getting picked up that are false positives All in all the model seems to pick on the bright signatures rather than round shapes and there is an odd combination of large forested lozenges and small bare spots picked up, but somehow round forested images with little excavated (bright) trenches on the south side is not getting picked up.

Further exploration - Height

Let’s see what other factors impact detectability in the CNN

Add un/predicted column to mound data

# Add the information on prediction to mound dataset
mounds <- mounds %>% 
  mutate(predicted60 =  case_when(TRAP_Code %in% predicted$TRAP_Code ~ "yes",
                                  TRAP_Code %nin% predicted$TRAP_Code ~ "no"
                                  ))

# Look at the predicted and un=predicted mounds 
unpredicted <- mounds[mounds$TRAP_Code%nin%predicted$TRAP_Code,]

plot(unpredicted$geometry, col = "red", main = "Mounds predicted (green) and unpredicted (red) by CNN");plot(predicted$geometry, col = "darkgreen", add= TRUE); 

# filter the mounds for largesh and noticeable ones (2+ meters)
large <- mounds %>% 
  filter(Height>=2) # 247 obs

mounds %>% 
  group_by(predicted60) %>% 
  summarize(min = min(Height), max = max(Height))
## `summarise()` ungrouping output (override with `.groups` argument)
## Simple feature collection with 2 features and 3 fields
## geometry type:  MULTIPOINT
## dimension:      XY
## bbox:           xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS:  WGS 84 / UTM zone 35N
## # A tibble: 2 x 4
##   predicted60   min   max                                               geometry
##   <chr>       <dbl> <dbl>                                       <MULTIPOINT [m]>
## 1 no            0      20 ((352505.2 4725060), (352519.3 4725007), (352536.4 47~
## 2 yes           0.1    10 ((352481.3 4724776), (352488.8 4724747), (352489.3 47~
# Check the height difference between un- and predicted mounds
boxplot(Height~predicted60, data = mounds)

# Check height distribution btw un- and predicted mounds
hist(mounds$Height[mounds$predicted60 == "no"], col = "pink", breaks  = 20);
hist(mounds$Height[mounds$predicted60 == "yes"], col = "yellow", add =TRUE, alpha = 0.5)

T-test on height

# Is there a significant relationship between the Height of a mound and its detection?
t.test(Height ~ as.factor(predicted60), data = mounds)
## 
##  Welch Two Sample t-test
## 
## data:  Height by as.factor(predicted60)
## t = 4.2302, df = 289.8, p-value = 3.136e-05
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  0.3825377 1.0482416
## sample estimates:
##  mean in group no mean in group yes 
##          1.877033          1.161644
str(t.test(Height ~ as.factor(predicted60), data = mounds))
## List of 10
##  $ statistic  : Named num 4.23
##   ..- attr(*, "names")= chr "t"
##  $ parameter  : Named num 290
##   ..- attr(*, "names")= chr "df"
##  $ p.value    : num 3.14e-05
##  $ conf.int   : num [1:2] 0.383 1.048
##   ..- attr(*, "conf.level")= num 0.95
##  $ estimate   : Named num [1:2] 1.88 1.16
##   ..- attr(*, "names")= chr [1:2] "mean in group no" "mean in group yes"
##  $ null.value : Named num 0
##   ..- attr(*, "names")= chr "difference in means between group no and group yes"
##  $ stderr     : num 0.169
##  $ alternative: chr "two.sided"
##  $ method     : chr "Welch Two Sample t-test"
##  $ data.name  : chr "Height by as.factor(predicted60)"
##  - attr(*, "class")= chr "htest"
# ... the answer seems to be yes with p at 3.14e-05

Side-by-side static visualisations

par(mfrow = c(1,2))

# Look at the predicted and un=predicted mounds 
plotRGB(kaz, stretch= "lin", main = "Predicted vs unpredicted mounds");
plot(mounds$geometry[mounds$predicted60 == "no"], add = TRUE, col = "red")
plot(mounds$geometry[mounds$predicted60 == "yes"], add = TRUE, col = "green")
plot(cnn_grid$geometry, add = TRUE, border = "white")
legend(x = "bottom",          # Position
       legend = c("undetected", "detected",
                  "grids with 60%+ mound probability"),  # Legend texts
       pch = c(1,1,0),       # Symbol shapes
       col = c("red","green", "lightgrey"))           # Line colors
     

# View Large mounds only
plotRGB(kaz, stretch= "lin", main = "Un/Predicted mounds of 2m+ height");
plot(mounds$geometry[mounds$predicted60 == "no" & mounds$Height >=2], add = TRUE, col = "red")
plot(mounds$geometry[mounds$predicted60 == "yes"& mounds$Height >=2], add = TRUE, col = "green")
plot(cnn_grid$geometry, add = TRUE, border = "white")
legend(x = "bottom",          # Position
       legend = c("unpredicted 2m+ mounds", "predicted 2m+ mounds",
                  "grids with 60%+ mound probability"),  # Legend texts
       pch = c(1,1,0),       # Symbol shapes
       col = c("red","green", "lightgrey"))           # Line colors

# See the predicted 146/166 mound features of all sizes 
cnn_grid %>% 
 # filter(mound_probability>0.5) %>% 
  ggplot()+
  geom_sf(aes(color = mound_probability))+
  scale_color_gradient(low = "green", 
                       high = "red",
                       "Mound probability")+
  geom_sf(data = mounds$geometry[mounds$predicted60 == "yes"], size = mounds$Height[mounds$predicted60 == "yes"], col = "lightgrey", alpha = 0.5) +
  ggtitle("Predicted mounds in Kazanlak")

# See the unpredicted 627 mounds 
cnn_grid %>% 
  # filter(mound_probability>0.5) %>% 
  ggplot()+
  geom_sf(aes(color = mound_probability))+
  scale_color_gradient(low = "green", 
                       high = "red",
                       "Mound probability")+
  #geom_sf(data = unpredicted$geometry)+
  geom_sf(data = mounds$geometry[mounds$predicted60 == "no"], size = mounds$Height[mounds$predicted60 == "no"], col = "lightgrey", alpha = 0.5)

  ggtitle("unpredicted mounds in Kazanlak")
## $title
## [1] "unpredicted mounds in Kazanlak"
## 
## attr(,"class")
## [1] "labels"

Other factors??

Differentiate predictions by other criteria - perhaps RS dataset visibility in 2001 IKONOS?

Create a segregated density image of mound presence?

Biggest ommisions (false negatives) appear in NE where large mounds are present and biggest overcommits (false positives) are south of the reservoir, where there are morphological features but few mounds.